keep = ls()

#Perform record linkage if matches do not exist:
run_fastlink = !(file.exists("./cleaned/wi_cwdb_tp_matches_raw.csv",
                "./cleaned/ia_cwdb_tp_matches_raw.csv") %>% all)

##########################
#Load Township Master Key#
##########################

master_key = fread('./raw/IA_WI_TOWNSHIPS_MASTER_KEY.csv')
master_county = master_key[, list(State, County, township_id)] %>% unique
#Unlist parent townships
master_list = master_key[, list(State, state_id, County, county_id, Township, township_id, Parent_Township)] %>% unique %>%
  .[, list( parent_tag = Parent_Township %>% str_split("; ?") %>% unlist), by = list(State, state_id, County, county_id, Township, township_id)]
master_list[, parent_name := parent_tag %>% str_extract("^[^:]+")]
master_list[, parent_year := parent_tag %>% str_extract("\\d+$")]
master_list[is.na(parent_name), parent_name := Township]
master_list[is.na(parent_year), parent_year := 1850]

#Merge in parent township IDs
parent_ids = master_key[, list(State, County, parent_name = Township, parent_id = township_id)] %>% unique

setkey(master_list, State, County, parent_name)
setkey(parent_ids, State, County, parent_name)

master_list = parent_ids[master_list] 
master_list[, setdiff(parent_name, Township), by= list(State, County)]

#Create network of townships
edges = master_list[parent_year >= 1857 & ((State == "IOWA" & parent_year <=1868) | (State == "WISCONSIN" & parent_year <=1865)), list(to = township_id, from = parent_id)]

#add Green Lake/Marquette County edges
add_edges = c(22201,	23302,
              22201,	23302,
              22201,	23302,
              22202,	23303,
              22203,	23313,
              22205,	23307,
              22206,	23310,
              22207,	23311,
              22210,	23320,
              22212,	23320,
              23302,	22201,
              23303,	22202,
              23307,	22205,
              23310,	22206,
              23311,	22207,
              23313,	22203) %>% 
  matrix(byrow = T, ncol = 2) %>%
  as.data.table %>% unique %>%
  setnames(paste0("V", 1:2), c('to', 'from'))

edges = rbind(edges, add_edges)

vertices = data.table(master_list$township_id %>% unique)

township_graph = edges %>% graph_from_data_frame(directed = F, vertices = vertices)

#Get clusters of townships that share boundaries
groups = clusters(township_graph)

group_list = split(names(groups$membership), groups$membership)

township2cluster = mapply(function(x,y) data.table(township_id = as.numeric(x), township_cluster = y), 
                          group_list, names(group_list), SIMPLIFY = F) %>%
  rbindlist()

###########################################
#Merge Census data to WI/IA Election Units#
###########################################

#Get Census Data: 1860
######################
iowa = fread(paste0(ipums_census_path,'/iowa_1860.txt'))
wisconsin = fread(paste0(ipums_census_path,'/wisconsin_1860.txt'))

#Select white males over 10 years of age in 1860
iowa = iowa[Gender %in% "Male" &
            as.numeric(ResidenceAge) >= 10 &
            Race %in% c("", "White")]

wisconsin = wisconsin[Gender %in% "Male" &
              as.numeric(ResidenceAge) >= 10 &
              Race %in% c("", "White")]

ipums_census = rbind(iowa, wisconsin)
ipums_census[, birth_year := 1860 - as.numeric(ResidenceAge)]

#Link Census townships to Master Key
####################################

#Crosswalk linking IPUMS 1860 Census towns to Township IDs
ipums_tp_key = fread('./raw/ipums_census_names_1860.csv')

setkey(ipums_tp_key, census_state, census_county, census_city)
setkey(ipums_census, State, County, City)

ipums_census = ipums_tp_key[ipums_census]

#Link Townships to Election Units
#################################

iowa = ipums_census[census_state %in% "Iowa"]
wisconsin = ipums_census[census_state %in% "Wisconsin"]

#Load election unit crosswalk
iowa_election_key =fread('./cleaned/iowa_elections_tp_xwalk.csv')
wisconsin_election_key = fread('./cleaned/wisconsin_elections_tp_xwalk.csv') %>% unique
iowa_election_key[, township_id := as.character(tp_id)]
wisconsin_election_key[, township_id := as.character(tp_id)]

#Get Iowa eligible voters, military aged males
setkey(iowa, township_id)
setkey(iowa_election_key, township_id)

iowa = iowa_election_key[iowa]
ia_census = iowa[, list(mil_age_males = .N, 
                        elig_1857 = sum(birth_year <= (1857 - 21), na.rm = T),
                        elig_1868 = sum(birth_year <= (1868 - 21), na.rm = T)) ,
                 by = ia_unit_id]

#Get Wisconsin eligible voters, military aged males
setkey(wisconsin, township_id)
setkey(wisconsin_election_key, township_id)

wisconsin = wisconsin_election_key[wisconsin]
wi_census = wisconsin[, list(mil_age_males = .N, 
                             elig_1857 = sum(birth_year <= (1857 - 21), na.rm = T),
                             elig_1865 = sum(birth_year <= (1865 - 21), na.rm = T)) , by = wi_unit_id]

#Get Census Data: 1870
######################
iowa_1870 = fread(paste0(ipums_census_path,'/iowa_1870.txt')) %>%
  .[Gender %in% "Male" &
    as.numeric(ResidenceAge) >= 20 &
    Race %in% c("White", "")]
wisconsin_1870 = fread(paste0(ipums_census_path,'/wisconsin_1870.txt')) %>%
  .[Gender %in% "Male" &
      as.numeric(ResidenceAge) >= 20 &
      Race %in% c("White", "")]

ipums_census_1870 = rbind(iowa_1870, wisconsin_1870)
ipums_census_1870[, birth_year := 1870 - as.numeric(ResidenceAge)]

#Link to township ids
ipums_1870_tp_key = fread('./raw/ipums_census_names_1870.csv')

setkey(ipums_1870_tp_key, census_state, census_county, census_township)
setkey(ipums_census_1870, State, County, Township)
ipums_census_1870 = ipums_1870_tp_key[ipums_census_1870]
ipums_census_1870[, township_id := as.character(township_id)]

#Collapse to election units
iowa_1870 = ipums_census_1870[census_state %in% "Iowa"]
wisconsin_1870 = ipums_census_1870[census_state %in% "Wisconsin"]

#Iowa military aged males, eligible voters from 1870 census
setkey(iowa_1870, township_id)
setkey(iowa_election_key, township_id)

iowa_1870 = iowa_election_key[iowa_1870]
ia_census_1870 = iowa_1870[, list(mil_age_males_70 = .N, 
                        elig_1857_70 = sum(birth_year <= (1857 - 21), na.rm = T),
                        elig_1868_70 = sum(birth_year <= (1868 - 21), na.rm = T)) ,
                 by = ia_unit_id]

#Wisconsin military aged males, eligible voters from 1870 census
setkey(wisconsin_1870, township_id)
setkey(wisconsin_election_key, township_id)

wisconsin_1870 = wisconsin_election_key[wisconsin_1870]
wi_census_1870 = wisconsin_1870[, list(mil_age_males_70 = .N, 
                             elig_1857_70 = sum(birth_year <= (1857 - 21), na.rm = T),
                             elig_1865_70 = sum(birth_year <= (1865 - 21), na.rm = T)) , by = wi_unit_id]

##############################
#Merge CWDB to Election Units#
##############################
survivedwar = fread(paste0(cwdb_path, '/person_table.csv'), 
                    select = c("personpk", "survivedwar")) %>%
              .[, survivedwar := survivedwar %in% "Y"]
muster_out = fread(paste0(cwdb_path, '/regiment_roster_table.csv'), select = c("personpk", 'outdate')) %>%
              .[, outdate := strptime(outdate, format = "%Y-%m-%d") %>% as.Date] %>%
              .[, list(outdate = max(outdate)), by = personpk] %>%
              .[outdate < as.Date("1865-10-07")]


cwdb_to_match = fread(paste0(cwdb_path, '/person_table.csv'), 
                      select = c('personpk', 'lastname', 'firstname', 'midname', 'residence', 'state', 'stateserved', 'enlistage', 'enlistdate'))
setnames(cwdb_to_match, c('residence', 'state'), c("residence_place", "residence_state"))
cwdb_to_match = cwdb_to_match[residence_state %in% c("IA", "WI") | (stateserved %in% c("IA", "WI") & residence_state %in% c("IA", "WI", ""))]
cwdb_to_match[, enlist_year := as.POSIXct(enlistdate, format = "%Y-%m-%d") %>% year()]
cwdb_to_match[, birth_year := enlist_year - enlistage]

#Crosswalk linking CWDB residence places to Township ID
cwdb_township_xwalk = fread("./raw/cwdb_township_crosswalk.csv")
cwdb_not_in_state = fread("./raw/cwdb_not_in_state.csv")

setkey(cwdb_township_xwalk, state, residence)
setkey(cwdb_not_in_state, state, residence)

cwdb_township_xwalk[, not_in_state := 0]
cwdb_township_xwalk[J(cwdb_not_in_state[, 1:2]), not_in_state := 1]

#Fix residence state
ia_blanks = c("Kossuth County",
              "Marietta",
              "Moscow",
              "Mount Vernon",
              "New Hope",
              "Ontario",
              "Paris",
              "Pariss",
              "Pella",
              "Richard Center",
              "Scandanavia",
              "Waterloo")

cwdb_to_match[residence_state %in% "" & 
           residence_place %in% ia_blanks, 
         residence_state := "IA"]

#Merge in township id to soldiers table
setkey(cwdb_to_match, residence_state, residence_place)
setkey(cwdb_township_xwalk, state, residence)

linked_cwdb = cwdb_township_xwalk[cwdb_to_match, allow.cartesian =T]
linked_cwdb[, township_id := as.character(matches)]

#Wisconsin linked soldiers
wi_linked_cwdb = linked_cwdb[state %in% "WI" & not_in_state == 0]
setkey(wi_linked_cwdb, township_id)
setkey(wisconsin_election_key, township_id)

wi_linked_cwdb = wisconsin_election_key[wi_linked_cwdb]
wi_linked_cwdb[, match_n := unique(wi_unit_id) %>% length, by = personpk]

#Iowa linked soldiers
ia_linked_cwdb = linked_cwdb[state %in% "IA" & not_in_state == 0]
setkey(ia_linked_cwdb, township_id)
setkey(iowa_election_key, township_id)

ia_linked_cwdb = iowa_election_key[ia_linked_cwdb]
ia_linked_cwdb[, match_n := unique(ia_unit_id) %>% length, by = personpk]


###################################
#Get IA/WI wartime experience data#
###################################

cwdb_treatment_data = fread('./cleaned/cwdb_treatment_data_ia_wi.csv')

#############################
#Prepare for census matching#
#############################

#clean cwdb names
wi_linked_cwdb[, paste("match", 
                       c("first", "middle", "last", "first_clean"), 
                       sep = "_") := clean_names(first = firstname,
                                                 middle = midname,
                                                 last = lastname
                                                 )]
ia_linked_cwdb[, paste("match", 
                       c("first", "middle", "last", "first_clean"), 
                       sep = "_") := clean_names(first = firstname,
                                                 middle = midname,
                                                 last = lastname
                       )]

ia_cwdb_to_match = ia_linked_cwdb[, list(personpk, 
                                         match_first, match_first_clean, match_middle, match_last,
                                         birth_year = as.numeric(birth_year), 
                                         ia_unit_id, township_id)] %>% unique

wi_cwdb_to_match = wi_linked_cwdb[, list(personpk, 
                                         match_first, match_first_clean, match_middle, match_last,
                                         birth_year, 
                                         wi_unit_id, township_id)] %>% unique

#clean census names
wisconsin[, paste("match", 
                       c("first", "middle", "last", "first_clean"), 
                       sep = "_") := clean_names(first = Given,
                                                 last = Surname
                       )]
wisconsin[, birth_year := 1860 - as.numeric(ResidenceAge)]

iowa[, paste("match", 
                  c("first", "middle", "last", "first_clean"), 
                  sep = "_") := clean_names(first = Given,
                                            last = Surname
                  )]
iowa[, birth_year := 1860 - as.numeric(ResidenceAge)]

wi_census_to_match = wisconsin[!is.na(wi_unit_id), list(mpcid, 
                                      match_first, match_first_clean, match_middle, match_last,
                                      birth_year, 
                                      wi_unit_id, township_id
                                      )]

ia_census_to_match = iowa[!is.na(ia_unit_id), list(mpcid, 
                                      match_first, match_first_clean, match_middle, match_last,
                                      birth_year = as.numeric(birth_year), 
                                      ia_unit_id, township_id
)]

#add township clusters
township2cluster[, township_id := as.character(township_id)]
setkey(township2cluster, township_id)
setkey(wi_cwdb_to_match, township_id)
setkey(wi_census_to_match, township_id)
setkey(ia_cwdb_to_match, township_id)
setkey(ia_census_to_match, township_id)

wi_cwdb_to_match = township2cluster[wi_cwdb_to_match]
wi_census_to_match = township2cluster[wi_census_to_match]
ia_cwdb_to_match = township2cluster[ia_cwdb_to_match]
ia_census_to_match = township2cluster[ia_census_to_match]

###########################
#Link to Census: Wisconsin#
###########################

if (run_fastlink) {
  #Unique election districts
  unique_districts = wi_census_to_match$wi_unit_id %>% unique %>% sort
  
  #Get blocks by election district matching
  blocks_cwdb = lapply(unique_districts, function (x) which(wi_cwdb_to_match$wi_unit_id %in% x))
  blocks_census = lapply(unique_districts, function (x) which(wi_census_to_match$wi_unit_id %in% x))
  
  #First pass to get counts
  m_out = matches_list_2 = matches_list_1 = vector(mode = 'list', length = length(blocks_cwdb))
  
  #Get matches by 
  for (i in 1:length(matches_list_1)) {
    if ((length(blocks_cwdb[[i]]) > 1) & (length(blocks_census[[i]]) > 1)) {
      
      print(i)
      out = try(
        fastLink(dfA = wi_cwdb_to_match[blocks_cwdb[[i]],],
                 dfB = wi_census_to_match[blocks_census[[i]],],
                 varnames = c("match_first_clean", "match_last", 'match_middle', 'township_cluster'),
                 stringdist.match = c("match_first_clean", "match_last", 'match_middle'),
                 #numeric.match = c('birth_year'),
                 partial.match = c("match_first_clean", "match_last", 'match_middle'),
                 cut.a.num = 1.5,
                 cut.p.num = 3.5,
                 cut.a = 0.94,
                 cut.p = 0.85,
                 n.cores = 7,
                 cond.indep = T,
                 estimate.only = T
        )
      )
      #if error, drop middle initial
      if (inherits(out, "try-error")) {
        out = fastLink(dfA = wi_cwdb_to_match[blocks_cwdb[[i]],],
                       dfB = wi_census_to_match[blocks_census[[i]],],
                       varnames = c("match_first_clean", "match_last", 'township_cluster'),
                       stringdist.match = c("match_first_clean", "match_last"),
                       #numeric.match = c('birth_year'),
                       partial.match = c("match_first_clean", "match_last"),
                       cut.a.num = 1.5,
                       cut.p.num = 3.5,
                       cut.a = 0.94,
                       cut.p = 0.85,
                       n.cores = 7,
                       cond.indep = T,
                       estimate.only = T
        )
        
      }
      matches_list_1[[i]] = out
    }
  }


#Get all counts and estimate EM for all blocks
all_counts = lapply(matches_list_1, function(x) x$patterns.w %>% as.data.table) %>% 
  lapply(function(x) if (!("gamma.4" %in% names(x)) & ("gamma.3" %in% names(x))) 
  {x[, c("gamma.3","gamma.4") := list(NA, gamma.3)]} 
  else {x}) %>%
  rbindlist(fill = T) %>%
  .[, list(gamma.1, gamma.2, gamma.3, gamma.4, counts)] %>%
  .[, list(counts = sum(counts, na.rm = T)), by = list(gamma.1, gamma.2, gamma.3, gamma.4)]

obs.a = blocks_cwdb %>% unlist %>% unique %>% 
  wi_cwdb_to_match[., personpk] %>% unique %>% length
obs.b = blocks_census %>% unlist %>% unique %>% 
  wi_census_to_match[., mpcid] %>% unique %>% length

testEM = emlinkMARmov(as.data.frame(all_counts), 
                      nobs.a = obs.a, nobs.b = obs.b, 
                      varnames = c("match_first_clean", "match_last", 'match_middle', 'township_cluster'))

#Get Matches
for (i in 1:length(matches_list_2)) {
  print(i)
  if ((length(blocks_cwdb[[i]]) > 1) & (length(blocks_census[[i]]) > 1)) {
    out = my_fastLink(dfA = wi_cwdb_to_match[blocks_cwdb[[i]],],
                     dfB = wi_census_to_match[blocks_census[[i]],],
                     varnames = c("match_first_clean", "match_last", 'match_middle', 'township_cluster'),
                     stringdist.match = c("match_first_clean", "match_last", 'match_middle'),
                     #numeric.match = c('birth_year'),
                     partial.match = c("match_first_clean", "match_last", 'match_middle'),
                     cut.a.num = 1.5,
                     cut.p.num = 3.5,
                     cut.a = 0.94,
                     cut.p = 0.85,
                     n.cores = 7,
                     em.obj = testEM, 
                     estimate.only = F,
                     threshold.match = 0.01,
                     dedupe.matches = F,
                     linprog.dedupe = F
                    )
    
    matches_list_2[[i]] = out
    m_out[[i]] = getMatches(wi_cwdb_to_match[blocks_cwdb[[i]],],
                            wi_census_to_match[blocks_census[[i]],],
                            out, threshold.match = 0.01) %>% as.data.table
  }
}


#Combine matches from all election units
m_out = lapply(m_out, function(x) if (!("gamma.4" %in% names(x)) & ("gamma.3" %in% names(x))) {x[, c("gamma.3","gamma.4") := list(NA, gamma.3)]} else {x}) %>% 
  rbindlist(fill =T)

fwrite(m_out, "./cleaned/wi_cwdb_tp_matches_raw.csv")
}

#Load matches
m_out = fread("./cleaned/wi_cwdb_tp_matches_raw.csv") %>% .[posterior > 0.5]
m_out[, mpcid := `dfB.match[, names.dfB]`]

#unique election unit matched
wi_matched_1 = wi_linked_cwdb[match_n == 1 & !is.na(matches), list(personpk, wi_unit_id, township_id)] %>% unique %>%
                .[, wt := 1/.N, by = personpk]
#multiple election unit matches
wi_matched_n = m_out[posterior > 0.5 &
                    !(personpk %in% wi_matched_1$personpk), 
                     list(wi_unit_id, township_id,
                          wt = posterior/sum(posterior)) , by = personpk] 

#Umatched: those who have location information, but otherwise unmatched
wi_unmatched = wi_linked_cwdb[!(personpk %in% c(wi_matched_1$personpk, wi_matched_n$personpk)), 
                              list(personpk, wi_unit_id, township_id, county_id = substr(township_id, 2, 3))] %>% 
                              unique %>%
                            .[!is.na(township_id)]

#Get 1860 township population to construct weights
wi_twp_pop = wisconsin[, list(pop = .N), by = list(wi_unit_id)]
setkey(wi_unmatched, wi_unit_id)
setkey(wi_twp_pop, wi_unit_id)

#Get unit of analysis population weights
wi_unmatched_twp = wi_twp_pop[wi_unmatched] %>%
                  .[, list(personpk, wi_unit_id, pop)] %>% 
                  unique %>%
                  .[, list(wi_unit_id,
                           wt = pop/sum(pop, na.rm = T)), 
                    by = list(personpk)]

#township enlistment
wi_tp_vets = rbind(wi_matched_1[, list(personpk, wi_unit_id, wt, type = 'use')], 
                   wi_matched_n[, list(personpk, wi_unit_id, wt, type = 'use')],
                   wi_unmatched_twp[, list(personpk, wi_unit_id, wt, type = 'alt')])

#Are veterans survived? Are they mustered out by one month before election time?
wi_tp_vets[, survived := personpk %in% survivedwar[(survivedwar), personpk]]
wi_tp_vets[, home := personpk %in% survivedwar[(survivedwar), personpk] &
                     personpk %in% muster_out[, personpk]]
#Merge in treatment data
setkey(wi_tp_vets, personpk)
setkey(cwdb_treatment_data, personpk)
wi_tp_vets = cwdb_treatment_data[wi_tp_vets]

#Aggregate veterans data to townships
wi_tp_vets = wi_tp_vets[, list(vets = sum(wt[type %in% 'use'], na.rm = T),
                  vets_alt = sum(wt[type %in% c('use', 'alt')], na.rm = T),
                  vets_alt_s = sum(wt[type %in% c('use', 'alt') & (survived)], na.rm = T), 
                  vets_alt_h = sum(wt[type %in% c('use', 'alt') & (survived) & (home)], na.rm = T), 
                  c_company_casualty_rate = weighted.mean(c_company_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                  c_company_kia_rate = weighted.mean(c_company_kia_rate[(survived)], wt[(survived)], na.rm = T),
                  c_company_kia = weighted.mean(c_company_kia[(survived)], wt[(survived)], na.rm = T),
                  #c_regiment_casualty_rate = weighted.mean(c_regiment_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                  c_combat_days = weighted.mean(c_combat_days[(survived)], wt[(survived)], na.rm = T),
                  c_usct_combat_days = weighted.mean(c_usct_combat_days[(survived)], wt[(survived)], na.rm = T),
                  c_battles = weighted.mean(c_battles[(survived)], wt[(survived)], na.rm = T),
                  c_usct_brigade_any = weighted.mean(c_usct_brigade_any[(survived)], wt[(survived)], na.rm = T)
                  ), by = wi_unit_id]

fwrite(wi_tp_vets, './cleaned/wi_twp_enlistment_cleaned.csv')


#Aggregate veterans data to counties

#Weight by size of township_cluster assigned to.
setkey(wisconsin, township_id)
setkey(township2cluster, township_id)
setkey(wi_unmatched, township_id)

wisconsin = township2cluster[wisconsin]
wi_unmatched = township2cluster[wi_unmatched]

#add county clusters
wisconsin[is.na(County_Cluster), County_Cluster := str_to_upper(ResidenceCounty)]
wisconsin[ResidenceCounty %in% "Polk", County_Cluster := "BURNETT;DOUGLAS;POLK"]

#Get township cluster pop to weight
wi_cty_pop = wisconsin %>%
             .[, list(pop = .N), by = list(township_cluster, County_Cluster)]

#Merge on township cluster; get person weights by township cluster
#summed by county cluster
setkey(wi_unmatched, township_cluster)
setkey(wi_cty_pop, township_cluster)
wi_unmatched_cty = wi_cty_pop[wi_unmatched] %>%
              .[!is.na(township_cluster), list(personpk, township_cluster, County_Cluster, pop)] %>% 
              unique %>%
              .[, list(township_cluster, County_Cluster,
                       wt = pop/sum(pop, na.rm = T)), 
                by = list(personpk)] %>%
              .[, list(wt = sum(wt, na.rm = T)), by = list(personpk, County_Cluster)] %>%
              .[!is.na(County_Cluster)]


#County enlistment
#Get township cluster for 1 and n matches:
wi_county_clusters = rbind(
                          wisconsin_election_key[, list(township_id, County_Cluster)],
                          wisconsin[, list(township_id, County_Cluster)]
                          )%>% unique
wi_matched_n[, township_id := as.character(township_id)]
setkey(wi_matched_1, township_id)
setkey(wi_matched_n, township_id)
setkey(wi_county_clusters, township_id)
wi_matched_1 = wi_county_clusters[wi_matched_1]
wi_matched_1[township_id %in% c(25002, 25004, 25006, 25007), County_Cluster := 'TREMPEALEAU']
wi_matched_n = wi_county_clusters[wi_matched_n]


#Compile county level enlistment
wi_county_vets = rbind(wi_matched_1[, list(personpk, County_Cluster, wt, type = 'use')], 
                       wi_matched_n[, list(personpk, County_Cluster, wt, type = 'use')],
                       wi_unmatched_cty[, list(personpk, County_Cluster, wt, type = 'alt')]) 

#Are veterans survived?
wi_county_vets[, survived := personpk %in% survivedwar[(survivedwar), personpk]]

#Merge in treatment data
setkey(wi_county_vets, personpk)
setkey(cwdb_treatment_data, personpk)
wi_county_vets = cwdb_treatment_data[wi_county_vets]

#Get twp veterans data
wi_county_vets = wi_county_vets[, list(vets = sum(wt[type %in% 'use'], na.rm = T),
                  vets_alt = sum(wt[type %in% c('use', 'alt')], na.rm = T),
                  vets_alt_s = sum(wt[type %in% c('use', 'alt') & (survived)], na.rm = T), 
                  c_company_casualty_rate = weighted.mean(c_company_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                  c_company_kia_rate = weighted.mean(c_company_kia_rate[(survived)], wt[(survived)], na.rm = T),
                  c_company_kia = weighted.mean(c_company_kia[(survived)], wt[(survived)], na.rm = T),
                  m_company_kia = weighted.mean(m_company_kia[(survived)], wt[(survived)], na.rm = T),
                  m_company_casualty_rate = weighted.mean(m_company_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                  #c_regiment_casualty_rate = weighted.mean(c_regiment_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                  c_combat_days = weighted.mean(c_combat_days[(survived)], wt[(survived)], na.rm = T),
                  c_usct_combat_days = weighted.mean(c_usct_combat_days[(survived)], wt[(survived)], na.rm = T),
                  c_battles = weighted.mean(c_battles[(survived)], wt[(survived)], na.rm = T),
                  c_usct_brigade_any = weighted.mean(c_usct_brigade_any[(survived)], wt[(survived)], na.rm = T)
), by = County_Cluster]

fwrite(wi_county_vets, './cleaned/wi_cty_enlistment_cleaned.csv')  

######################
#Link to Census: Iowa#
######################

if (run_fastlink) {
    #Unique election districts
    unique_districts = ia_census_to_match$ia_unit_id %>% unique %>% sort
    
    #Get blocks by election district matching
    blocks_cwdb = lapply(unique_districts, function (x) which(ia_cwdb_to_match$ia_unit_id %in% x))
    blocks_census = lapply(unique_districts, function (x) which(ia_census_to_match$ia_unit_id %in% x))
    
    #First pass to get counts
    m_out = matches_list_2 = matches_list_1 = vector(mode = 'list', length = length(blocks_cwdb))
    
    #Get matches by 
    for (i in 1:length(matches_list_1)) {
      if ((length(blocks_cwdb[[i]]) > 1) & (length(blocks_census[[i]]) > 1)) {
        
        print(i)
        out = try(
          fastLink(dfA = ia_cwdb_to_match[blocks_cwdb[[i]],],
                   dfB = ia_census_to_match[blocks_census[[i]],],
                   varnames = c("match_first_clean", "match_last", 'match_middle', "birth_year", 'township_cluster'),
                   stringdist.match = c("match_first_clean", "match_last", 'match_middle'),
                   numeric.match = c('birth_year'),
                   partial.match = c("match_first_clean", "match_last", 'match_middle', "birth_year"),
                   cut.a.num = 1.5,
                   cut.p.num = 3.5,
                   cut.a = 0.94,
                   cut.p = 0.85,
                   n.cores = 7,
                   cond.indep = T,
                   estimate.only = T
          )
        )
        #if error, drop middle initial
        if (inherits(out, "try-error")) {
          out = fastLink(dfA = ia_cwdb_to_match[blocks_cwdb[[i]],],
                         dfB = ia_census_to_match[blocks_census[[i]],],
                         varnames = c("match_first_clean", "match_last", "birth_year", 'township_cluster'),
                         stringdist.match = c("match_first_clean", "match_last"),
                         numeric.match = c('birth_year'),
                         partial.match = c("match_first_clean", "match_last", "birth_year"),
                         cut.a.num = 1.5,
                         cut.p.num = 3.5,
                         cut.a = 0.94,
                         cut.p = 0.85,
                         n.cores = 7,
                         cond.indep = T,
                         estimate.only = T
          )
          
        }
        matches_list_1[[i]] = out
      }
    }

#Get all counts and estimate EM for all blocks
all_counts = lapply(matches_list_1, function(x) x$patterns.w %>% as.data.table) %>% 
  lapply(function(x) if (!("gamma.5" %in% names(x)) & ("gamma.3" %in% names(x))) 
  {x[, c("gamma.3","gamma.4", 'gamma.5') := list(NA, gamma.3, gamma.4)]} 
  else {x}) %>%
  rbindlist(fill = T) %>%
  .[, list(gamma.1, gamma.2, gamma.3, gamma.4, gamma.5, counts)] %>%
  .[, list(counts = sum(counts, na.rm = T)), by = list(gamma.1, gamma.2, gamma.3, gamma.4, gamma.5)]

obs.a = blocks_cwdb %>% unlist %>% unique %>% 
  ia_cwdb_to_match[., personpk] %>% unique %>% length
obs.b = blocks_census %>% unlist %>% unique %>% 
  ia_census_to_match[., mpcid] %>% unique %>% length

testEM = emlinkMARmov(as.data.frame(all_counts), 
                      nobs.a = obs.a, nobs.b = obs.b, 
                      varnames = c("match_first_clean", "match_last", 'match_middle', 'birth_year', 'township_cluster'))

#Get Matches
for (i in 1:length(matches_list_2)) {
  print(i)
  if ((length(blocks_cwdb[[i]]) > 1) & (length(blocks_census[[i]]) > 1)) {
    out = my_fastLink(dfA = ia_cwdb_to_match[blocks_cwdb[[i]],],
                      dfB = ia_census_to_match[blocks_census[[i]],],
                      varnames = c("match_first_clean", "match_last", 'match_middle', "birth_year", 'township_cluster'),
                      stringdist.match = c("match_first_clean", "match_last", 'match_middle'),
                      numeric.match = c('birth_year'),
                      partial.match = c("match_first_clean", "match_last", 'match_middle', "birth_year"),
                      cut.a.num = 1.5,
                      cut.p.num = 3.5,
                      cut.a = 0.94,
                      cut.p = 0.85,
                      n.cores = 7,
                      em.obj = testEM, 
                      estimate.only = F,
                      threshold.match = 0.01,
                      dedupe.matches = F,
                      linprog.dedupe = F
    )
    
    matches_list_2[[i]] = out
    m_out[[i]] = getMatches(ia_cwdb_to_match[blocks_cwdb[[i]],],
                            ia_census_to_match[blocks_census[[i]],],
                            out, threshold.match = 0.01) %>% as.data.table
  }
}
  m_out = lapply(m_out, function(x) if (!("gamma.5" %in% names(x)) & ("gamma.3" %in% names(x))) {x[, c("gamma.3","gamma.4", 'gamma.5') := list(NA, gamma.3, gamma.4)]} else {x}) %>%
    rbindlist(fill = T)

  fwrite(m_out, "./cleaned/ia_cwdb_tp_matches_raw.csv")

}


#Load matches with probability > 0.5

m_out = fread("./cleaned/ia_cwdb_tp_matches_raw.csv") %>%
  .[posterior > 0.5]

#Match to one election unit
ia_matched_1 = ia_linked_cwdb[match_n == 1 & !is.na(matches), list(personpk, ia_unit_id, township_id)] %>% unique %>%
  .[, wt := 1/.N, by = personpk]

#Match to multiple election units
ia_matched_n = m_out[posterior > 0.5 &
                       !(personpk %in% ia_matched_1$personpk), 
                     list(ia_unit_id, township_id,
                          wt = posterior/sum(posterior)) , by = list(personpk)] %>%
  .[, list(wt = sum(wt)
           ), by = list(personpk, ia_unit_id, township_id)]

#Umatched to election units
ia_unmatched = ia_linked_cwdb[!(personpk %in% c(ia_matched_1$personpk, ia_matched_n$personpk)), 
                              list(personpk, ia_unit_id, township_id, county_id = substr(township_id, 2, 3))] %>% 
                              unique %>%
                              .[!is.na(township_id)]

#UNMATCHED to election unit: weight by election unit population
ia_twp_pop = iowa[, list(pop = .N), by = list(ia_unit_id)]
setkey(ia_unmatched, ia_unit_id)
setkey(ia_twp_pop, ia_unit_id)
ia_unmatched_twp = ia_twp_pop[ia_unmatched] %>%
  .[, list(personpk, ia_unit_id, pop)] %>% 
  unique %>%                 
  .[, list(ia_unit_id,
           wt = pop/sum(pop, na.rm = T)), 
    by = list(personpk)]
  
#Get county weights
ia_cty_pop = iowa[, list(mpcid, county_id = substr(township_id, 2,3))] %>%
  .[, list(pop = .N), by = list(county_id)]
setkey(ia_unmatched, county_id)
setkey(ia_cty_pop, county_id)
ia_unmatched_cty = ia_cty_pop[ia_unmatched] %>%
  .[, list(personpk, county_id, pop)] %>% 
  unique %>%
  .[, list(county_id,
           wt = pop/sum(pop, na.rm = T)), 
    by = list(personpk)]  
  

#Soldier-township links with weights
ia_tp_vets = rbind(ia_matched_1[, list(personpk, ia_unit_id, wt, type = 'use')], 
                   ia_matched_n[, list(personpk, ia_unit_id, wt, type = 'use')],
                   ia_unmatched_twp[, list(personpk, ia_unit_id, wt, type = 'alt')])

#Are veterans survived?
ia_tp_vets[, survived := personpk %in% survivedwar[(survivedwar), personpk]]

#Merge in treatment data
setkey(ia_tp_vets, personpk)
setkey(cwdb_treatment_data, personpk)
ia_tp_vets = cwdb_treatment_data[ia_tp_vets]

#Get twp veterans data
ia_tp_vets = ia_tp_vets[, list(vets = sum(wt[type %in% 'use'], na.rm = T),
                                       vets_alt = sum(wt[type %in% c('use', 'alt')], na.rm = T),
                                       vets_alt_s = sum(wt[type %in% c('use', 'alt') & (survived)], na.rm = T), 
                                       c_company_casualty_rate = weighted.mean(c_company_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                                       c_company_kia_rate = weighted.mean(c_company_kia_rate[(survived)], wt[(survived)], na.rm = T),
                                       c_company_kia = weighted.mean(c_company_kia[(survived)], wt[(survived)], na.rm = T),
                                       m_company_kia = weighted.mean(m_company_kia[(survived)], wt[(survived)], na.rm = T),
                                       m_company_casualty_rate = weighted.mean(m_company_casualty_rate[(survived)], wt[(survived)], na.rm = T), 
                                      #c_regiment_casualty_rate = weighted.mean(c_regiment_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                                       c_combat_days = weighted.mean(c_combat_days[(survived)], wt[(survived)], na.rm = T),
                                       c_usct_combat_days = weighted.mean(c_usct_combat_days[(survived)], wt[(survived)], na.rm = T),
                                       c_battles = weighted.mean(c_battles[(survived)], wt[(survived)], na.rm = T),
                                       c_usct_brigade_any = weighted.mean(c_usct_brigade_any[(survived)], wt[(survived)], na.rm = T)
                                      ), by = ia_unit_id]

fwrite(ia_tp_vets, './cleaned/ia_twp_enlistment_cleaned.csv')

#County Enlistment
ia_county_vets = rbind(ia_matched_1[, list(personpk, county_id = substr(township_id,2,3), wt, type = 'use')], 
                       ia_matched_n[, list(personpk, county_id = substr(township_id,2,3), wt, type = 'use')],
                       ia_unmatched_cty[, list(personpk, county_id, wt, type = 'alt')]) 

#Are veterans survived?
ia_county_vets[, survived := personpk %in% survivedwar[(survivedwar), personpk]]

#Merge in treatment data
setkey(ia_county_vets, personpk)
setkey(cwdb_treatment_data, personpk)
ia_county_vets = cwdb_treatment_data[ia_county_vets]

#Get twp veterans data
ia_county_vets = ia_county_vets[, list(vets = sum(wt[type %in% 'use'], na.rm = T),
                               vets_alt = sum(wt[type %in% c('use', 'alt')], na.rm = T),
                               vets_alt_s = sum(wt[type %in% c('use', 'alt') & (survived)], na.rm = T), 
                               c_company_casualty_rate = weighted.mean(c_company_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                               c_company_kia_rate = weighted.mean(c_company_kia_rate[(survived)], wt[(survived)], na.rm = T),
                               c_company_kia = weighted.mean(c_company_kia[(survived)], wt[(survived)], na.rm = T),
                               #c_regiment_casualty_rate = weighted.mean(c_regiment_casualty_rate[(survived)], wt[(survived)], na.rm = T),
                               c_combat_days = weighted.mean(c_combat_days[(survived)], wt[(survived)], na.rm = T),
                               c_usct_combat_days = weighted.mean(c_usct_combat_days[(survived)], wt[(survived)], na.rm = T),
                               c_battles = weighted.mean(c_battles[(survived)], wt[(survived)], na.rm = T),
                               c_usct_brigade_any = weighted.mean(c_usct_brigade_any[(survived)], wt[(survived)], na.rm = T)
), by = county_id]

#Add county names
ia_county_xwalk = master_county[State %in% "IOWA", list(county_id = substr(township_id,2,3), County)] %>% unique
setkey(ia_county_vets, county_id)
setkey(ia_county_xwalk, county_id)
ia_county_vets = ia_county_xwalk[ia_county_vets]

fwrite(ia_county_vets, './cleaned/ia_cty_enlistment_cleaned.csv')  

####################################
#Merge Census and CWDB to elections#
####################################

#Load elections
wi_elections = fread('./cleaned/wisconsin_elections_tp_file.csv')
ia_elections = fread('./cleaned/iowa_elections_tp_file.csv')

#Prepare merge, IOWA
setkey(ia_elections, ia_unit_id)
setkey(ia_census, ia_unit_id)
setkey(ia_census_1870, ia_unit_id)
setkey(ia_tp_vets, ia_unit_id)

ia_prelim = ia_census[ia_elections] %>% ia_census_1870[.] %>% ia_tp_vets[.]

#Prepare merge, Wisconsin
setkey(wi_elections, wi_unit_id)
setkey(wi_census, wi_unit_id)
setkey(wi_census_1870, wi_unit_id)
setkey(wi_tp_vets, wi_unit_id)

wi_prelim = wi_census[wi_elections] %>% wi_census_1870[.] %>% wi_tp_vets[.]

fwrite(ia_prelim, './cleaned/ia_twp_analysis_cleaned.csv')
fwrite(wi_prelim, './cleaned/wi_twp_analysis_cleaned.csv')

#Cleanup
rm(list = setdiff(ls(), c(keep, 'keep')))
